/*
 * Routines for dealing with the map
 */
#include <sys/types.h>
#include <netinet/in.h>

#include "libfma.h"
#include "lf_fabric.h"
#include "lf_topo_map.h"
#include "lf_fms_comm.h"
#include "lf_fma_flags.h"
#include "lf_lag.h"
#include "libmyri.h"

#include "fma.h"
#include "fma_fms.h"
#include "fma_fabric.h"
#include "fma_myri.h"
#include "fma_map.h"
#include "fma_dfs_route.h"
#include "fma_standalone.h"
#include "fma_probe.h"
#include "fma_verify.h"
#include "fma_tunnel.h"

/*
 * Local prototypes
 */
/* Allocate zeroed route storage for every NIC in the fabric */
static void fma_alloc_route_buffers(struct lf_fabric *fp);
/* Mark every route length of every NIC in the fabric as -1 */
static void fma_clear_routes(struct lf_fabric *fp);
/* Callbacks handed to fma_tunnel_send(); v is the target struct lf_host */
static void fma_dist_map_done(void *v);
static void fma_dist_map_failed(void *v);
/* Start queued map sends until the list or the pending limit is exhausted */
static void fma_next_map_sends(void);

/*
 * Allocate the global map-info record (A.map_info) and fill in the
 * fields that never change afterwards.
 * An allocation failure is fatal: control reaches fma_perror_exit(1).
 */
void
fma_init_map_info_vars()
{
  struct fma_map_info *info;

  LF_CALLOC(A.map_info, struct fma_map_info, 1);
  info = A.map_info;

  /* static message header used for every topo map packet we send */
  info->topo_map_hdr.type_16 = htons(FMA_PACKET_TYPE);
  info->topo_map_hdr.subtype_16 = htons(FMA_SUBTYPE_TOPO_MAP);

  /* no map has been received yet */
  info->first_mapping = TRUE;

  return;

 except:
  fma_perror_exit(1);
}


/*
 * We got a new map, translate it into a fabric struct
 * and then do some validation.
 */
int
fma_copy_and_load_map(
  struct lf_topo_map *topo_map_in,
  int topo_map_size)
{
  int rc;
  struct lf_topo_map *topo_map;
  lf_string_t why;

  /* Make a copy of the topo map */
  topo_map = (struct lf_topo_map *) malloc(topo_map_size);
  if (topo_map == NULL) LF_ERROR(("Error allocating topo_map buffer"));
  memcpy(topo_map, topo_map_in, topo_map_size);

  /* Load this map struct */
  fma_load_map_struct(topo_map, topo_map_size);

  /* validate the map */
  rc = fma_validate_map(why);

  /* map is OK, route it */
  if (rc == 0) {
    rc = fma_route_topo_map();

    /* print any errors */
    if (rc != 0) {
      fma_perror();
    }

    if (A.debug > 1) fma_dumpit();

  /* map is not valid */
  } else {
    fma_map_is_invalid(TRUE, why);
  }
  return rc;

 except:
  fma_perror_exit(1);
  return -1;
}

/*
 * Release the current topo map
 */
void
fma_release_topo_map()
{
  /* Cancel any map distribution in progress since the map is going away */
  fma_cancel_map_distribution();

  LF_FREE(A.map_info->current_topo_map);
}

/*
 * Turn a map file into a fabric struct
 *
 * topo_map      - wire-format map (multi-byte fields are converted with
 *                 ntohl/ntohs here); the pointer is stored in
 *                 A.map_info->current_topo_map, replacing the previous map
 * topo_map_size - size of topo_map in bytes, stored alongside it
 *
 * Steps, in order: release the old map and fabric, create a new fabric,
 * create hosts, xbars and NICs, allocate route buffers, wire up links and
 * their states, trim anonymous xbar port counts, locate local NICs,
 * compute xbar distances and record the mapper.
 * Any LF_CALLOC / LF_ERROR failure ends in fma_perror_exit(1).
 */
void
fma_load_map_struct(
  struct lf_topo_map *topo_map,
  int topo_map_size)
{
  struct lf_fabric *fp;
  int mapper_level;
  struct fma_map_info *mip;
  int rc;

  /* Release the current topo map */
  fma_release_topo_map();

  /* Set the new topo map */
  mip = A.map_info;
  mip->current_topo_map = topo_map;
  mip->current_topo_map_size = topo_map_size;
  mip->mi_most_ports = 0;		/* smallest supported */
  mip->first_mapping = FALSE;		/* we have received first map */

  /* get pointer to map holder */
  /* NOTE(review): A.my_host is not cleared here; if it pointed into the
   * fabric being freed it stays stale until a local MAC is matched in the
   * NIC loop below - TODO confirm lf_free_fabric() frees the hosts */
  lf_free_fabric(A.fabric);
  fp = fma_create_fabric();
  A.fabric = fp;

  fma_log("  %d hosts, %d nics, %d xbars, %d links",
    ntohl(topo_map->num_hosts_32), ntohl(topo_map->num_nics_32),
    ntohl(topo_map->num_xbars_32), ntohl(topo_map->num_links_32));
  fma_log("map version is %d", ntohl(topo_map->map_id_32));

  /* copy over counts for xbars and NICs */
  fp->num_hosts = ntohl(topo_map->num_hosts_32);
  fp->num_xbars = ntohl(topo_map->num_xbars_32);
  FMA_FABRIC(fp)->num_nics = ntohl(topo_map->num_nics_32);
  FMA_FABRIC(fp)->num_links = ntohl(topo_map->num_links_32);

  /* allocate space for xbars and NICs */
  /* link_end_node / link_end_port remember one end of each link so that
   * fma_load_link_state() can re-apply link states later */
  LF_CALLOC(fp->hosts, struct lf_host *, fp->num_hosts);
  LF_CALLOC(fp->xbars, struct lf_xbar *, fp->num_xbars);
  LF_CALLOC(FMA_FABRIC(fp)->nics, struct lf_nic *, FMA_FABRIC(fp)->num_nics);
  LF_CALLOC(FMA_FABRIC(fp)->link_end_node, union lf_node *,
            FMA_FABRIC(fp)->num_links);
  LF_CALLOC(FMA_FABRIC(fp)->link_end_port, char, FMA_FABRIC(fp)->num_links);

  /* Allocate all the hosts */
  {
    struct lf_host *hp;
    char *hnp;
    int h;

    /* hostnames are walked as consecutive NUL-terminated strings */
    hnp = LF_TOPO_HOSTNAME_ARRAY(topo_map);

    for (h=0; h<fp->num_hosts; ++h) {
      hp = fma_create_host();
      if (hp == NULL) LF_ERROR(("Error allocating host"));
      fp->hosts[h] = hp;

      hp->host_index = h;	/* assign host index */
      LF_DUP_STRING(hp->hostname, hnp);
      hnp += strlen(hnp)+1;
    }
  }

  /* copy over all the xbars */
  {
    struct lf_topo_xbar *topo_xbar;
    int x;

    topo_xbar = LF_TOPO_XBAR_ARRAY(topo_map);

    for (x=0; x<fp->num_xbars; ++x) {
      struct lf_xbar *xp;

      xp = fma_create_xbar(ntohs(topo_xbar->num_ports_16),
	                   ntohl(topo_xbar->xbar_id_32),
			   topo_xbar->quadrant_disable_8);
      if (xp == NULL) LF_ERROR(("Creating xbar"));

      fp->xbars[x] = xp;

      /* fill in stuff from map */
      xp->clos_level = topo_xbar->clos_level_8;
      FMA_XBAR(xp)->node_id = ntohl(topo_xbar->node_id_32);
      xp->x_topo_index = x;

      ++topo_xbar;
    }
  }
  
  /* copy over all the NICs */
  {
    struct lf_topo_nic *topo_nic;
    unsigned char *lap;
    int nic_ports;
    int n;
    int h;

    topo_nic = LF_TOPO_NIC_ARRAY(topo_map);
    nic_ports = 0;		/* remember most ports on any NIC */

    for (n=0; n<FMA_FABRIC(fp)->num_nics; ++n) {
      struct lf_nic *nicp;
      struct lf_host *hp;

      nicp = fma_create_nic(topo_nic->num_ports_8, topo_nic->mac_addr,
	  topo_nic->firmware_type_8);
      if (nicp == NULL) LF_ERROR(("Creating nic"));

      FMA_FABRIC(fp)->nics[n] = nicp;
      nicp->n_topo_index = n;
      FMA_NIC(nicp)->nic_node_id = ntohl(topo_nic->node_id_32);

      /* use node ID to get index of host */
      /* NOTE(review): h comes straight from the map and is not checked
       * against fp->num_hosts before indexing fp->hosts */
      h = LF_TOPO_NIC_HOST_ID(FMA_NIC(nicp)->nic_node_id);
      hp = fp->hosts[h];
      rc = lf_add_existing_nic_to_host(nicp, hp);
      if (rc != 0) LF_ERROR(("Error adding NIC to host"));

      /* host-wide fields are overwritten by each NIC of the host in turn */
      nicp->host_nic_id = LF_TOPO_NIC_HOST_NIC_ID(FMA_NIC(nicp)->nic_node_id);
      hp->fma_flags = ntohs(topo_nic->flags_16);
      hp->fw_type = topo_nic->firmware_type_8;
      hp->subfabric_mask = ntohl(topo_nic->subfab_mask_32);
      hp->distributor = ntohs(topo_nic->map_distributor_16);
      nicp->partition = ntohs(topo_nic->partition_16);
      nicp->mag_id = ntohs(topo_nic->mag_id_16);
      nicp->lag_id_index = ntohs(topo_nic->lag_id_index_16);

      /* If lag_id_index is zero, this means no lag */
      if (nicp->lag_id_index != 0) {
	lap = LF_TOPO_LAG_ID(topo_map, nicp->lag_id_index);
	lf_lag_copy(nicp->nic_lag_id, lap);
      }

      /* Make sure "max_mac_addr" is up-to-date for owning host */
      if (LF_MAC_CMP(nicp->mac_addr, FMA_HOST(hp)->max_mac_addr) > 0) {
	int ni;

	LF_MAC_COPY( FMA_HOST(hp)->max_mac_addr, nicp->mac_addr);

	/* Look for us in the map */
	/* this search only runs for a NIC that raised the host's max MAC */
	for (ni=0; ni<A.myri->num_nics; ++ni) {
	  if (LF_MAC_CMP(A.myri->nic_info[ni]->myri_info.mac_addr,
		         nicp->mac_addr) == 0) {
	    A.my_host = hp;
	  }
	}
      }

      if (nicp->num_ports > nic_ports) {
	nic_ports = nicp->num_ports;
      }

      ++topo_nic;
    }

    FMA_FABRIC(fp)->max_nic_ports = nic_ports;	/* for allocating route space */
  }

  /* allocate space for routes */
  fma_alloc_route_buffers(fp);

  /* Now, make all the links */
  {
    struct lf_topo_link *topo_link;
    lf_topo_link_state_t *topo_link_state;
    int which;
    int vni;
    int l;

    topo_link = LF_TOPO_LINK_ARRAY(topo_map);
    topo_link_state = LF_TOPO_LINK_STATE_ARRAY(topo_map);

    for (l=0; l<FMA_FABRIC(fp)->num_links; ++l) {
      union lf_node *np[2];
      int port[2];
      int i;

      /* get pointers to the two nodes */
      /* each node_index encodes xbar-vs-NIC, the array index and the port */
      /* NOTE(review): decoded indices and ports are used without range
       * checks against the arrays they index */
      for  (i=0; i<2; ++i) {
	int node_index;

	node_index = ntohl(topo_link->node_index_32[i]);

	if (LF_TOPO_NI_XBAR(node_index)) {
	  np[i] = LF_NODE(fp->xbars[LF_TOPO_NI_INDEX(node_index)]);
	} else {
	  np[i] = LF_NODE(FMA_FABRIC(fp)->nics[LF_TOPO_NI_INDEX(node_index)]);
	}
	port[i] = LF_TOPO_NI_PORT(node_index);
      }

      /* get verifier */
      /* vni == 0 means the link has no verifier; "which" selects the end */
      vni = ntohl(topo_link->verifier_32);
      which = LF_TOPO_NI_WHICH_END(vni);

      /* link them to each other */
      for  (i=0; i<2; ++i) {
	if (np[i]->ln_type == LF_NODE_XBAR) {
	  struct lf_xbar *xp;
	  xp = LF_XBAR(np[i]);

	  xp->topo_ports[port[i]] = np[1-i];
	  xp->topo_rports[port[i]] = port[1-i];

	  /* If there is a verifier and this is the right end, save the info */
	  if (vni != 0 && which == i) {
	    xp->verifiers[port[i]].ver_nicp =
	      FMA_FABRIC(fp)->nics[LF_TOPO_NI_INDEX(vni)];
	    xp->verifiers[port[i]].ver_port = LF_TOPO_NI_PORT(vni);
	  }

	} else {
	  struct lf_nic *nicp;
	  nicp = LF_NIC(np[i]);

	  nicp->topo_ports[port[i]] = np[1-i];
	  nicp->topo_rports[port[i]] = port[1-i];
	}
      }

      /* set the link state */
      /* end 0 of the link is the one remembered for later state updates */
      FMA_FABRIC(fp)->link_end_node[l] = np[0];
      FMA_FABRIC(fp)->link_end_port[l] = port[0];
      lf_set_node_link_state(np[0], port[0], *topo_link_state);

      ++topo_link;
      ++topo_link_state;
    }
  }

  /* Truncate "num_ports" on each xbar to make everything go a little faster */
  {
    int x;
    int p;

    for (x=0; x<fp->num_xbars; ++x) {
      struct lf_xbar *xp;
      int xbar_ports;

      xp = fp->xbars[x];

      /* only modify num_ports for anonymous xbars */
      if (xp->xbar_id == 0) {

	/* scan down for the highest connected port; the scan stops at p == 0
	 * without testing it, so num_ports never drops below 1 */
	for (p=xp->num_ports-1; p>0; --p) {
	  if (xp->topo_ports[p] != NULL) {
	    break;
	  }
	}
	xp->num_ports = p+1;
	xbar_ports = 16;	/* anonymous xbars really have 16 ports */

      /* tagged xbars have accurate port counts */
      } else {
	xbar_ports = xp->num_ports;
      }

      /* keep track of most ports seen */
      if (xbar_ports > mip->mi_most_ports) {
        mip->mi_most_ports = xbar_ports;
      }
    }
  }

  fma_find_nic_indices();	/* update NIC indices */
  fma_calc_xbar_dists();	/* calc distance from each xbar to this host */

  /* save mapper info */
  mip->mi_map_version = ntohl(topo_map->map_id_32);
  mapper_level = ntohs(topo_map->mapper_level_16);
  fma_set_mapper(topo_map->mapper_mac_addr, mapper_level);

  /* optional debug listing of each NIC's LAG information */
#ifdef FM_TEST_LAG
  {
    struct lf_host *hp;
    struct lf_nic *nicp;
    int h;
    int n;

    for (h=0; h<fp->num_hosts; ++h) {
      hp = fp->hosts[h];

      for (n=0; n<hp->num_nics; ++n) {
	nicp = hp->nics[n];

	if (hp->fma_flags & FMA_FLAG_CAN_DO_LAG) {
	  printf("%s, LAG_ID=\"%s\", mag_id=%d\n",
	      fma_mac_to_hostname(nicp->mac_addr),
	      lf_lag_id_string(nicp->nic_lag_id),
	      nicp->mag_id);
	} else {
	  printf("%s cannot LAG, mag_id=%d\n",
	      fma_mac_to_hostname(nicp->mac_addr), nicp->mag_id);
	}
      }
    }
  }
#endif
  return;

 except:
  fma_perror_exit(1);
}

/*
 * We got a new link state from the FMS, update our map with the new states
 *
 * topo_link_state - array with one entry per link of the current fabric,
 *                   in the same order as the links of the current topo map
 *
 * The states are copied into the stored topo map, applied to the
 * remembered end of every link, and xbar distances are recomputed.
 * Nothing is done when no fabric or topo map has been loaded yet.
 */
void
fma_load_link_state(
  lf_topo_link_state_t *topo_link_state)
{
  struct lf_fabric *fp;
  lf_topo_link_state_t *lsp;
  int l;

  /* get pointer to fabric */
  fp = A.fabric;

  /* no map loaded yet: there are no links to update */
  if (fp == NULL || A.map_info->current_topo_map == NULL) return;

  /*
   * Copy this new link state into our topo map.  The length is in bytes,
   * so scale the link count by the element size rather than assuming a
   * link state is exactly one byte wide.
   */
  lsp = LF_TOPO_LINK_STATE_ARRAY(A.map_info->current_topo_map);
  memcpy(lsp, topo_link_state, FMA_FABRIC(fp)->num_links * sizeof(*lsp));

  /* set the link state for each link */
  for (l=0; l<FMA_FABRIC(fp)->num_links; ++l) {

    lf_set_node_link_state(FMA_FABRIC(fp)->link_end_node[l],
                           FMA_FABRIC(fp)->link_end_port[l],
                           topo_link_state[l]);
  }

  /* xbar distances may have changed */
  fma_calc_xbar_dists();	/* calc distance from each xbar to this host */
}

/*
 * Give every NIC in the fabric zeroed storage for its routes:
 * FMA_NUM_ROUTE_INDICES route slots of FMA_IFC_ROUTE_LEN bytes each,
 * plus one length entry per slot.  All routes are then marked cleared.
 * An allocation failure is fatal.
 */
static void
fma_alloc_route_buffers(
  struct lf_fabric *fp)
{
  const int slot_count = FMA_NUM_ROUTE_INDICES;
  const int buf_bytes = slot_count * FMA_IFC_ROUTE_LEN;
  int idx;

  for (idx = 0; idx < FMA_FABRIC(fp)->num_nics; ++idx) {
    struct lf_nic *cur;

    cur = FMA_FABRIC(fp)->nics[idx];
    LF_CALLOC(FMA_NIC(cur)->route_buf, unsigned char, buf_bytes);
    LF_CALLOC(FMA_NIC(cur)->route_lens, int8_t, slot_count);
  }

  /* freshly allocated routes start out cleared */
  fma_clear_routes(fp);

  return;

 except:
  fma_perror_exit(1);
}

/*
 * Mark every route of every NIC in the fabric as unset by filling the
 * NIC's route length table with -1.
 */
static void
fma_clear_routes(
  struct lf_fabric *fp)
{
  const int slot_count = FMA_NUM_ROUTE_INDICES;
  int idx;

  for (idx = 0; idx < FMA_FABRIC(fp)->num_nics; ++idx) {
    struct lf_nic *cur;

    cur = FMA_FABRIC(fp)->nics[idx];
    memset(FMA_NIC(cur)->route_lens, -1, slot_count);
  }
}


/*
 * Look up a NIC in the current fabric by MAC address.
 * Returns the NIC's index in the fabric NIC table, or -1 when no NIC
 * has that address.
 */
int
fma_mac_to_nic_index(
  lf_mac_addr_t mac)
{
  struct lf_fabric *fabric;
  int idx;
  int found;

  fabric = A.fabric;
  found = -1;

  for (idx = 0; idx < FMA_FABRIC(fabric)->num_nics; ++idx) {
    struct lf_nic *cand;

    cand = FMA_FABRIC(fabric)->nics[idx];
    if (memcmp(mac, cand->mac_addr, sizeof(lf_mac_addr_t)) == 0) {
      found = idx;	/* first match wins */
      break;
    }
  }
  return found;
}

/*
 * Find our NICs in this map
 * This will also create an lf_nic struct for each of our NICs
 * that is disconnected so that nip->nic_ptr is always valid.
 */
void
fma_find_nic_indices()
{
  struct fma_myri *mp;
  struct lf_fabric *fp;
  struct lf_host *hp;
  struct lf_nic **nics;
  struct lf_nic *nicp;
  int n;
  int nni;		/* next NIC index */

  fp = A.fabric;
  mp = A.myri;

  hp = A.my_host;
  if (hp->num_nics != mp->num_nics) {
    LF_CALLOC(nics, struct lf_nic *, mp->num_nics);
    nni = hp->num_nics;
  } else {
    nics = NULL;
    nni = 0;
  }

  /* find each NIC in map */
  for (n=0; n<mp->num_nics; ++n) {
    struct fma_nic_info *nip;
    
    nip = mp->nic_info[n];
    nip->ni_map_index = fma_mac_to_nic_index(nip->myri_info.mac_addr);

    if (nip->ni_map_index == -1) {
      fma_log("Cannot find NIC %d (" LF_MAC_FORMAT ") in map!",
	nip->nic_id, LF_MAC_ARGS(nip->myri_info.mac_addr));

      /* allocate a NIC struct for this NIC */
      nicp = fma_create_nic(nip->myri_info.num_ports, nip->myri_info.mac_addr,
	  		    hp->fw_type);
      if (nicp == NULL) {
	LF_ERROR(("Error allocating struct for NIC %d", n));
      }
      nicp->host = hp;
      nicp->host_nic_id = n;
      nicp->slot = nni;
      FMA_NIC(nicp)->local_nip = nip;

      nip->nic_ptr = nicp;
      nics[nni] = nicp;
      ++nni;
    } else {
      nicp = FMA_FABRIC(fp)->nics[nip->ni_map_index];

      /* mark NIC as local to this host */
      FMA_NIC(nicp)->local_nip = nip;

      fma_log("Found NIC %d at index %d!", nip->nic_id, nip->ni_map_index);
      nip->nic_ptr = nicp;

      /* copy this nic pointer into new nics array, if needed */
      if (nics != NULL) {
	nics[nicp->slot] = nicp;
      }
    }
  }

  /* if we had to allocate more NICs, fix up the host struct now */
  if (nics != NULL) {
    LF_FREE(hp->nics);
    hp->nics = nics;
  }
  return;

 except:
  fma_perror_exit(1);
}

/*
 * Compute and install routes for the currently loaded topo map.
 * Quick routes are checked first and recomputed if any are bad; the full
 * route set is then cleared, recalculated and loaded into firmware.
 * Always returns 0.
 */
int
fma_route_topo_map()
{
  int stale_quick_routes;

  /* Check the quick routes we already have */
  stale_quick_routes = fma_check_quick_routes();
  if (A.debug) {
    fma_log("%d quick routes need to be re-calculated", stale_quick_routes);
  }

  /* Rebuild quick routes only when at least one failed the check */
  if (stale_quick_routes > 0) {
    fma_calc_all_quick_routes();
  }

  /* Start from an empty route table */
  fma_clear_routes(A.fabric);

  /* Full route calculation, sized by the first local NIC's route count */
  fma_dfs_calc_all_routes(A.myri->nic_info[0]->myri_info.num_routes);

  /* Push the result down to the firmware */
  fma_set_all_routes();

  return 0;
}

/*
 * Check that the loaded map describes this host correctly.
 * For each local NIC that appears in the map, the firmware type recorded
 * for the NIC and the flags recorded for its host must match ours.
 *
 * why - buffer that receives a short explanation on failure
 *
 * Returns 0 when the map is acceptable, -1 (with why filled in) otherwise.
 */
int
fma_validate_map(
  char *why)
{
  struct fma_myri *local;
  struct lf_fabric *fabric;
  int idx;

  fabric = A.fabric;
  local = A.myri;

  for (idx = 0; idx < local->num_nics; ++idx) {
    struct fma_nic_info *info;
    struct lf_nic *mapped;

    info = local->nic_info[idx];

    /* NICs that are not in the map have nothing to verify */
    if (info->ni_map_index == -1) {
      continue;
    }

    mapped = FMA_FABRIC(fabric)->nics[info->ni_map_index];

    /* firmware type in the map must be what we are running */
    if (FMA_NIC(mapped)->fw_type != myri_firmware_type()) {
      sprintf(why, "My firmware type is incorrect in map");
      return -1;
    }

    /* host flags in the map must be our flags */
    if (mapped->host->fma_flags != A.my_fma_flags) {
      sprintf(why, "My flags are incorrect in map");
      return -1;
    }
  }

  fma_log("map seems OK");
  return 0;
}

/*
 * Find the host in the current fabric whose highest NIC MAC address
 * equals max_mac_addr.  Returns NULL when there is no fabric or no
 * host matches.
 */
struct lf_host *
fma_find_host_by_max_mac_addr(
  lf_mac_addr_t max_mac_addr)
{
  struct lf_fabric *fabric;
  struct lf_host *match;
  int idx;

  fabric = A.fabric;
  match = NULL;

  /* without a map there is nothing to search */
  if (fabric != NULL) {
    for (idx = 0; idx < fabric->num_hosts; ++idx) {
      struct lf_host *cand;

      cand = fabric->hosts[idx];
      if (LF_MAC_CMP(FMA_HOST(cand)->max_mac_addr, max_mac_addr) == 0) {
        match = cand;
        break;
      }
    }
  }
  return match;
}

/*
 * Map is invalid - call the right routine
 */
void
fma_map_is_invalid(
  int use_me,
  char *why)
{
  if (A.debug) {
    fma_log("Map version %d invalid: %s", A.map_info->mi_map_version, why);
  }

  if (A.run_state == FMA_RUN_STANDALONE) {
    fma_standalone_map_is_invalid(why);
  } else {
    fma_fms_map_is_invalid(A.map_info->mi_map_version, use_me, why);
  }
}

/*
 * Record a new mapping level for this FMA, apply it to all probes and
 * refresh the NIC reply info.
 */
void
fma_set_mapping_level(
  int level)
{
  /* remember the level in the standalone state */
  A.stand->my_level = level;

  /* every probe carries the level too */
  fma_set_probe_levels(level);

  /* NIC reply info must be refreshed after the change */
  fma_update_nic_reply_info();
}

/*
 * Handle a remap request sent by another FMA.
 * The request is honoured only when we run standalone and are an active
 * mapper; otherwise it is ignored (logged when debugging).
 * If the request names our current map version and no mapping is in
 * progress, map distribution is cancelled and the mapping-done state is
 * reset before mapping is started.
 *
 * reqp       - the request; map_version_32 is in network byte order
 * sender_mac - MAC address of the requesting FMA
 */
void
fma_peer_remap_request(
  struct fma_map_request *reqp,
  lf_mac_addr_t sender_mac)
{
  struct fma_standalone_data *stand;
  int req_version;

  stand = A.stand;

  if (A.debug) {
    fma_log("%s requests remap: %s", fma_mac_to_hostname(sender_mac),
        reqp->why);
  }

  /* Only a standalone FMA acts on peer requests */
  if (A.run_state != FMA_RUN_STANDALONE) {
    if (A.debug) {
      fma_log("Ignoring map request from peer while not standalone");
    }
    return;
  }

  /* ...and only while it is an active mapper */
  if (!stand->active) {
    if (A.debug) {
      fma_log("Ignoring peer request to map fabric: not active");
    }
    return;
  }

  /* The request line was already logged above when debugging */
  if (A.debug) {
    fma_log("Following peer request to map fabric");
  } else {
    fma_log("%s requests remap: %s", fma_mac_to_hostname(sender_mac),
        reqp->why);
  }

  /*
   * The peer is complaining about the map we currently hold and we are
   * not mapping right now: drop that map's state so a remap is forced.
   */
  req_version = ntohl(reqp->map_version_32);
  if (!stand->mapping_in_progress
      && req_version == A.map_info->mi_map_version) {
    if (A.debug) {
      fma_log("Current map version noted invalid, force remap");
    }

    fma_cancel_map_distribution();
    fma_reset_mapping_done();
  }

  fma_start_mapping_fabric(fma_standalone_map_done);
}

/*
 * Record who the new mapper is
 *
 * mac_addr - MAC address of the mapper
 * level    - mapper's level
 *
 * When the address or level differs from what is recorded, the change is
 * logged and the stored mapper hostname is refreshed from the fabric (or
 * cleared when the mapper's NIC or hostname is unknown).  The address and
 * level are then saved and the NIC reply info updated.
 *
 * Hostnames originate from the received map, so every string is built
 * with a bounded write; an over-long name is truncated rather than
 * overrunning the buffer.
 */
void
fma_set_mapper(
  lf_mac_addr_t mac_addr,
  int level)
{
  struct fma_map_info *mip;

  mip = A.map_info;

  /* Report if mapper changing */
  if (LF_MAC_CMP(mip->mi_mapper_mac_addr, mac_addr) != 0 ||
      mip->mi_mapper_level != level) {
    struct lf_nic *mnicp;
    lf_string_t whost;
    lf_string_t nhost;

    /* Make a string describing old mapper */
    if (mip->mi_mapper_hostname[0] != '\0') {
      snprintf(whost, sizeof(whost), "%s ("LF_MAC_FORMAT")",
	  mip->mi_mapper_hostname,
	  LF_MAC_ARGS(mip->mi_mapper_mac_addr));
    } else {
      snprintf(whost, sizeof(whost), LF_MAC_FORMAT,
	  LF_MAC_ARGS(mip->mi_mapper_mac_addr));
    }

    /* update mapper hostname string */
    mnicp = lf_find_nic_by_mac(A.fabric, mac_addr);
    if (mnicp != NULL
	&& mnicp->host != NULL
	&& mnicp->host->hostname != NULL
	&& mnicp->host->hostname[0] != '\0') {
      snprintf(nhost, sizeof(nhost), "%s ("LF_MAC_FORMAT")",
	  mnicp->host->hostname, LF_MAC_ARGS(mac_addr));
      /* bounded copy: the name's length is not under our control */
      snprintf(mip->mi_mapper_hostname, sizeof(mip->mi_mapper_hostname),
	  "%s", mnicp->host->hostname);
    } else {
      snprintf(nhost, sizeof(nhost), LF_MAC_FORMAT, LF_MAC_ARGS(mac_addr));
      mip->mi_mapper_hostname[0] = '\0';
    }

    fma_log("Mapper was %s, l=%d, is now %s, l=%d",
	whost, mip->mi_mapper_level, nhost, level);
  }

  /* Save this as address of new mapping winner */
  mip->mi_mapper_level = level;
  LF_MAC_COPY(mip->mi_mapper_mac_addr, mac_addr);

  /* update NIC reply info */
  fma_update_nic_reply_info();
}

/*
 * Decide, for every host in the fabric, which host will send it the map.
 * The choice is stored in each host's "distributor" field (a host index)
 * so that it travels with the distributed map and every recipient knows
 * to whom it must relay.
 *
 * Host 0 is taken to be this host; its distributor is set to num_hosts,
 * an index that matches no host.  One list of distributor candidates is
 * kept per subfabric, each seeded with host 0.  Hosts are visited in index
 * order: a host is given the current candidate of the subfabric selected
 * by the highest set bit of its subfabric mask, the candidate is advanced
 * (with wrap-around) after it has been handed "fanout" targets, and a
 * host that can distribute is appended to the list of every subfabric it
 * belongs to.
 *
 * Allocation failure is fatal.
 */
void
fma_assign_dist_targets(
  struct lf_fabric *fp)
{
  struct lf_host *self;
  int **cand_list;	/* per subfabric: host indices able to distribute */
  int *cand_cnt;	/* per subfabric: entries used in cand_list */
  int *handed_out;	/* per subfabric: targets given to current candidate */
  int *cursor;		/* per subfabric: index of current candidate */
  int fanout;
  int num_sf;
  int bits;
  int sf;
  int h;

  self = fp->hosts[0];		/* known to be true at this time */

  /* nobody distributes to us: num_hosts is not a valid host index */
  self->distributor = fp->num_hosts;

  /* number of subfabrics = bits needed to hold our subfabric mask */
  num_sf = 0;
  for (bits = self->subfabric_mask; bits > 0; bits >>= 1) {
    ++num_sf;
  }

  /* targets per distributor; hardcoded for the moment */
  fanout = 20;

  /* without subfabrics there is nothing to assign */
  if (num_sf == 0) return;

  /* per-subfabric bookkeeping, each candidate list seeded with ourselves */
  LF_CALLOC(cand_cnt, int, num_sf);
  LF_CALLOC(handed_out, int, num_sf);
  LF_CALLOC(cursor, int, num_sf);
  LF_CALLOC(cand_list, int *, num_sf);
  for (sf = 0; sf < num_sf; ++sf) {
    LF_CALLOC(cand_list[sf], int, fp->num_hosts);
  }
  for (sf = 0; sf < num_sf; ++sf) {
    cand_list[sf][0] = self->host_index;
    cand_cnt[sf] = 1;
  }

  /* walk every other host and pick its distributor */
  for (h = 1; h < fp->num_hosts; ++h) {
    struct lf_host *target;
    int home;		/* subfabric used to pick the distributor */
    int pick;

    target = fp->hosts[h];

    /* index of the highest set bit in the host's subfabric mask */
    home = 0;
    for (bits = target->subfabric_mask; bits > 1; bits >>= 1) {
      ++home;
    }

    /* This happens sometimes when i mess up in the simulator */
    if (home >= num_sf) {
      if (A.debug) {
        fma_log("Impossible sf_id = %d, num_sf=%d\n", home, num_sf);
      }
      continue;
    }

    /* current candidate of that subfabric distributes to this host */
    pick = cursor[home];
    target->distributor = cand_list[home][pick];

    /* candidate is full: move to the next one, wrapping to the first */
    if (++handed_out[home] >= fanout) {
      handed_out[home] = 0;
      cursor[home] = (pick + 1 >= cand_cnt[home]) ? 0 : pick + 1;
    }

    /* hosts that cannot distribute never become candidates */
    if ((target->fma_flags & FMA_FLAG_CAN_DISTRIBUTE) == 0) {
      continue;
    }

    /* become a candidate in every subfabric this host is a member of */
    for (sf = 0; sf < num_sf; ++sf) {
      if ((target->subfabric_mask & (1<<sf)) != 0) {
        cand_list[sf][cand_cnt[sf]] = h;
        ++cand_cnt[sf];
      }
    }
  }

  /* release the bookkeeping */
  for (sf = 0; sf < num_sf; ++sf) {
    LF_FREE(cand_list[sf]);
  }
  LF_FREE(cand_list);
  LF_FREE(cand_cnt);
  LF_FREE(handed_out);
  LF_FREE(cursor);
  return;

 except:
  fma_perror_exit(1);
}

/*
 * Send the current topo map to every host that names this host as its
 * distributor.
 *
 * The list of target host indices is rebuilt in A.map_info, the
 * distribution counters and start time are reset, and the first batch of
 * sends is started.  completion_rtn (may be NULL) is remembered and is
 * called once no sends are pending - immediately from here if nothing
 * was sent at all.
 *
 * Allocation failure is fatal.
 */
void
fma_distribute_topo_map(
  void (*completion_rtn)(void))
{
  struct lf_fabric *fabric;
  struct fma_map_info *info;
  int self_index;
  int targets;
  int idx;

  fabric = A.fabric;
  info = A.map_info;

  /* fresh distribution list, large enough for every host */
  LF_FREE(info->mi_dist_list);
  LF_CALLOC(info->mi_dist_list, int, fabric->num_hosts);

  info->mi_dist_complete_rtn = completion_rtn;

  /* collect the hosts whose distributor is us */
  self_index = A.my_host->host_index;
  targets = 0;
  for (idx = 0; idx < fabric->num_hosts; ++idx) {
    if (fabric->hosts[idx]->distributor != self_index) {
      continue;
    }
    info->mi_dist_list[targets++] = idx;
  }
  info->mi_dist_cnt = targets;

  if (A.debug) {
    fma_log("Distributing topo map to %d hosts", info->mi_dist_cnt);
  }

  /* reset progress counters and kick off the first sends */
  info->mi_next_map_send = 0;
  info->mi_dist_total = targets;
  info->mi_dist_failed = 0;
  info->mi_dist_global_remaining = fabric->num_hosts - 1;
  info->mi_dist_start_time = time(NULL);
  fma_next_map_sends();

  /* nothing in flight means the distribution is already complete */
  if (info->mi_map_sends_pending <= 0
      && info->mi_dist_complete_rtn != NULL) {
    info->mi_dist_complete_rtn();
  }
  return;

 except:
  fma_perror_exit(1);
}

/*
 * Start map sends to the next hosts on the distribution list, stopping
 * when the list is exhausted or FMA_MAX_MAP_SENDS_PENDING sends are in
 * flight.  Hosts without a quick route, and hosts whose flags are not
 * "can route, no FMS", are skipped.
 */
static void
fma_next_map_sends()
{
  struct lf_fabric *fabric;
  struct fma_map_info *info;

  fabric = A.fabric;
  info = A.map_info;

  for (;
       info->mi_next_map_send < info->mi_dist_cnt
         && info->mi_map_sends_pending < FMA_MAX_MAP_SENDS_PENDING;
       ++info->mi_next_map_send) {
    struct lf_host *dest;
    struct fma_host *fdest;
    int dest_index;

    dest_index = info->mi_dist_list[info->mi_next_map_send];
    dest = fabric->hosts[dest_index];
    fdest = FMA_HOST(dest);

    /* cannot reach this host */
    if (fdest->quick_route_len < 0) {
      if (A.debug) {
        fma_log("No route to host %d: %s for map dist", dest_index,
            fma_mac_to_hostname(dest->nics[0]->mac_addr));
      }
      continue;
    }

    /* host must be able to route and must not have an FMS */
    if ((dest->fma_flags & (FMA_FLAG_CAN_ROUTE|FMA_FLAG_HAS_FMS))
        != FMA_FLAG_CAN_ROUTE) {
      if (A.debug > 2) {
        fma_log("Skipping host %d: %s, flags = %x", dest_index,
            fma_mac_to_hostname(dest->nics[0]->mac_addr), dest->fma_flags);
      }
      continue;
    }

    if (A.debug) {
      fma_log("Start map dist to %d \"%s\"", dest_index, dest->hostname);
    }

    /* tunnel header + map along the quick route; remember the send id */
    fdest->map_send_id = fma_tunnel_send(
        &info->topo_map_hdr, sizeof(info->topo_map_hdr),
        info->current_topo_map, info->current_topo_map_size,
        fdest->quick_route, fdest->quick_route_len,
        fdest->quick_nic, fdest->quick_port,
        fma_dist_map_done,
        fma_dist_map_failed,
        (void *) dest);

    /* one more send in flight */
    ++info->mi_map_sends_pending;
  }
}

/*
 * Tunnel callback: the map send to one host finished.
 * vhp is the struct lf_host the map was sent to.  The pending count is
 * decremented, further sends are started, and the completion routine is
 * called once nothing is pending.
 */
static void
fma_dist_map_done(
  void *vhp)
{
  struct lf_host *dest;
  struct fma_map_info *info;

  dest = vhp;
  info = A.map_info;

  if (A.debug) {
    fma_log("Map distribution to %d: %s complete",
        dest->host_index, fma_mac_to_hostname(dest->nics[0]->mac_addr));
  }

  /* this send is no longer outstanding */
  FMA_HOST(dest)->map_send_id = 0;
  --info->mi_map_sends_pending;

  /* use the freed slot for the next host, if any */
  fma_next_map_sends();

  /* no sends left in flight: tell whoever asked for the distribution */
  if (info->mi_map_sends_pending <= 0
      && info->mi_dist_complete_rtn != NULL) {
    info->mi_dist_complete_rtn();
  }
}

/*
 * Tunnel callback: the map send to one host failed.
 * vhp is the struct lf_host the map was sent to.  The failure is logged
 * and counted; otherwise the handling matches a completed send.
 */
static void
fma_dist_map_failed(
  void *vhp)
{
  struct lf_host *dest;
  struct fma_map_info *info;

  dest = vhp;
  info = A.map_info;

  fma_log("Map distribution to %d: %s failed?", dest->host_index,
      fma_mac_to_hostname(dest->nics[0]->mac_addr));

  /* record the failure and retire the send */
  ++info->mi_dist_failed;
  FMA_HOST(dest)->map_send_id = 0;
  --info->mi_map_sends_pending;

  /* use the freed slot for the next host, if any */
  fma_next_map_sends();

  /* no sends left in flight: tell whoever asked for the distribution */
  if (info->mi_map_sends_pending <= 0
      && info->mi_dist_complete_rtn != NULL) {
    info->mi_dist_complete_rtn();
  }
}

/*
 * Cancel any map distributions in progress
 */
void
fma_cancel_map_distribution()
{
  struct lf_fabric *fp;
  struct lf_host *hp;
  int h;

  fp = A.fabric;

  /* If no fabric, nothing to distribute */
  if (fp == NULL) return;

  /* cancel any map sends in progress on each host */
  for (h=0; h<fp->num_hosts; ++h) {
    hp = fp->hosts[h];
    if (FMA_HOST(hp)->map_send_id != 0) {
      fma_tunnel_cancel_send(FMA_HOST(hp)->map_send_id);
      FMA_HOST(hp)->map_send_id = 0;
    }
  }
}

/*
 * Debug dump of the current fabric: every host with its NICs, then every
 * xbar with what is attached to each of its ports.
 * The host count goes to stdout via printf; everything else is logged
 * with fma_log.
 */
void
fma_dumpit()
{
  struct lf_fabric *fp;
  struct lf_host *hp;
  struct lf_nic *nicp;
  int i;
  int n;
  int x;
  lf_string_t s;
  int off;

  fp = A.fabric;
  printf("Fabric loaded, %d hosts\n", fp->num_hosts);

  for (i=0; i<fp->num_hosts; ++i) {
    hp = fp->hosts[i];
    if (hp == NULL) { fma_log("%d: null", i); continue; }

    fma_log("\"%s\", %d NICs, sf_mask=%x, dist=%d, qrlen=%d", hp->hostname,
	hp->num_nics, hp->subfabric_mask, hp->distributor,
	FMA_HOST(hp)->quick_route_len);

    for (n=0; n<hp->num_nics;++n) {
      nicp = hp->nics[n];

      fma_log("  NIC %d, " LF_MAC_FORMAT ", flags=%x, mag_id=%d",
	  nicp->host_nic_id, LF_MAC_ARGS(nicp->mac_addr),
	  hp->fma_flags, nicp->mag_id);
    }
  }

  fma_log("%d xbars", fp->num_xbars);
  for (x=0; x<fp->num_xbars; ++x) {
    struct lf_xbar *xp;
    int p;

    xp = fp->xbars[x];
    fma_log("      xbar[%d] - id=%d, %d ports, qd=%x, clos=%d, d=%d",
	x, xp->xbar_id, xp->num_ports, xp->quadrant_disable,
	xp->clos_level, FMA_XBAR(xp)->xbar_dist[0]);

    for (p=0; p<xp->num_ports; ++p) {
      union lf_node *onp;

      /* build one line per port in s, then log it */
      off = sprintf(s, "       %d -", p);

      /* unconnected port: log just the port number */
      onp = xp->topo_ports[p];
      if (onp == NULL) {
	/* s is data, never a format string */
	fma_log("%s", s);
	continue;
      }

      if (onp->ln_type == LF_NODE_XBAR) {
	struct lf_xbar *oxp;

	oxp = LF_XBAR(onp);
	off += sprintf(s+off, " x%d:%d, clos=%d, d=%d", FMA_XBAR(oxp)->node_id,
		xp->topo_rports[p], oxp->clos_level,
		FMA_XBAR(oxp)->xbar_dist[0]);


      } else if (onp->ln_type == LF_NODE_NIC) {
	off += sprintf(s+off, " %s", lf_node_string(onp, xp->topo_rports[p]));
	/* flag a NIC hanging off an xbar that is not at clos level 1 */
	if (xp->clos_level != 1) off += sprintf(s+off, " XXXX");
      } else {
	off += sprintf(s+off, " ????");
      }

      off += sprintf(s+off, ")");

      /*
       * The line contains node/host names taken from the map, so it must
       * be passed as an argument: a '%' in a name would otherwise be
       * interpreted as a conversion by fma_log.
       */
      fma_log("%s", s);
    }
  }
}
